Date importing and first Inspection¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
In [2]:
# os
folder_path = "./plots"

if not os.path.exists(folder_path) :
    os.makedirs(folder_path)
In [3]:
df=pd.read_csv("housing.csv")
In [4]:
df
Out[4]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND

20640 rows × 10 columns

Features :¶

longitude : geographic coordinate ( district east-west postion)¶

latitude : geographic coordinat (district north-south postion)¶

total_rooms : sum of all room in district¶

population : total population in district¶

median_income : median household income in district¶

median_house_value :median house value in district¶

houseing_median_age : median age of houses in district¶

ocean_proximity : district's proximity to the ocean¶

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [6]:
# there is missing value in total_bedrooms
# all the columns floats exept one column( ocean_proximity) is object
In [7]:
# show the null value
df[df.total_bedrooms.isna()]
Out[7]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
290 -122.16 37.77 47.0 1256.0 NaN 570.0 218.0 4.3750 161900.0 NEAR BAY
341 -122.17 37.75 38.0 992.0 NaN 732.0 259.0 1.6196 85100.0 NEAR BAY
538 -122.28 37.78 29.0 5154.0 NaN 3741.0 1273.0 2.5762 173400.0 NEAR BAY
563 -122.24 37.75 45.0 891.0 NaN 384.0 146.0 4.9489 247100.0 NEAR BAY
696 -122.10 37.69 41.0 746.0 NaN 387.0 161.0 3.9063 178400.0 NEAR BAY
... ... ... ... ... ... ... ... ... ... ...
20267 -119.19 34.20 18.0 3620.0 NaN 3171.0 779.0 3.3409 220500.0 NEAR OCEAN
20268 -119.18 34.19 19.0 2393.0 NaN 1938.0 762.0 1.6953 167400.0 NEAR OCEAN
20372 -118.88 34.17 15.0 4260.0 NaN 1701.0 669.0 5.1033 410700.0 <1H OCEAN
20460 -118.75 34.29 17.0 5512.0 NaN 2734.0 814.0 6.6073 258100.0 <1H OCEAN
20484 -118.72 34.28 17.0 3051.0 NaN 1705.0 495.0 5.7376 218600.0 <1H OCEAN

207 rows × 10 columns

In [8]:
# check if there any duplicate rows
df[df.duplicated()]
Out[8]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
In [9]:
df.describe()
Out[9]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
In [10]:
df.describe(include="O")
Out[10]:
ocean_proximity
count 20640
unique 5
top <1H OCEAN
freq 9136
In [11]:
df.ocean_proximity.unique()
Out[11]:
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)
In [12]:
# NEAR BAY  قرب الخليج
# <1H OCEAN تبعد عن المحيط اقل من ساعة
# INLAND داخل البلاد
# NEAR OCEAN قرب المحيط
# ISLAND جزيرة
In [13]:
df.ocean_proximity.value_counts()
Out[13]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [14]:
df.hist(bins=50,figsize=(12,12))
plot_filename = os.path.join(folder_path , "my_plot.png")
plt.savefig(plot_filename)
plt.show()

Data Cleaning and create additional features¶

In [15]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [16]:
# dron missing value
df.dropna(inplace=True)

Add New Features¶

Room per Household¶

In [17]:
df["rooms_per_household"]=df.total_rooms.div(df.households)
In [18]:
df
Out[18]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 6.984127
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 6.238137
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 8.288136
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 5.817352
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6.281853
... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND 5.045455
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND 6.114035
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND 5.205543
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND 5.329513
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND 5.254717

20433 rows × 11 columns

In [19]:
df.rooms_per_household.nlargest(10)
Out[19]:
1914     141.909091
1979     132.533333
12447     62.422222
1913      61.812500
11862     59.875000
1912      56.269231
9676      52.848214
11707     52.690476
2395      50.837838
1240      47.515152
Name: rooms_per_household, dtype: float64
In [20]:
df.rooms_per_household.nsmallest(10)
Out[20]:
5916     0.846154
8219     0.888889
3126     1.000000
14818    1.130435
17820    1.130435
4552     1.260870
4550     1.378486
4587     1.411290
4602     1.465753
12484    1.550409
Name: rooms_per_household, dtype: float64
In [21]:
df.loc[[5916,8219,1914,1979]]
Out[21]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household
5916 -118.44 34.28 46.0 11.0 11.0 24.0 13.0 2.8750 162500.0 <1H OCEAN 0.846154
8219 -118.21 33.79 33.0 32.0 18.0 96.0 36.0 4.5938 112500.0 NEAR OCEAN 0.888889
1914 -120.10 38.91 33.0 1561.0 282.0 30.0 11.0 1.8750 500001.0 INLAND 141.909091
1979 -120.08 38.80 34.0 1988.0 511.0 36.0 15.0 4.6250 162500.0 INLAND 132.533333

Population per Household¶

In [22]:
df["pop_per_household"]=df.population.div(df.households)

Bedroom per Rooms¶

In [23]:
df["bedrooms_per_room"]=df.total_bedrooms.div(df.total_rooms)
In [24]:
df
Out[24]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household pop_per_household bedrooms_per_room
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 6.984127 2.555556 0.146591
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 6.238137 2.109842 0.155797
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 8.288136 2.802260 0.129516
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 5.817352 2.547945 0.184458
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6.281853 2.181467 0.172096
... ... ... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND 5.045455 2.560606 0.224625
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND 6.114035 3.122807 0.215208
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND 5.205543 2.325635 0.215173
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND 5.329513 2.123209 0.219892
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND 5.254717 2.616981 0.221185

20433 rows × 13 columns

In [25]:
df.describe()
Out[25]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value rooms_per_household pop_per_household bedrooms_per_room
count 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000 20433.000000
mean -119.570689 35.633221 28.633094 2636.504233 537.870553 1424.946949 499.433465 3.871162 206864.413155 5.431344 3.071533 0.213039
std 2.003578 2.136348 12.591805 2185.269567 421.385070 1133.208490 382.299226 1.899291 115435.667099 2.482946 10.438269 0.057983
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000 0.846154 0.692308 0.100000
25% -121.800000 33.930000 18.000000 1450.000000 296.000000 787.000000 280.000000 2.563700 119500.000000 4.441441 2.429032 0.175427
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.536500 179700.000000 5.230769 2.817582 0.203162
75% -118.010000 37.720000 37.000000 3143.000000 647.000000 1722.000000 604.000000 4.744000 264700.000000 6.052381 3.281513 0.239821
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000 141.909091 1243.333333 1.000000

Explanatory Data Analysis¶

which factors influence house prices?¶

In [26]:
df
Out[26]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household pop_per_household bedrooms_per_room
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 6.984127 2.555556 0.146591
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 6.238137 2.109842 0.155797
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 8.288136 2.802260 0.129516
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 5.817352 2.547945 0.184458
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6.281853 2.181467 0.172096
... ... ... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND 5.045455 2.560606 0.224625
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND 6.114035 3.122807 0.215208
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND 5.205543 2.325635 0.215173
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND 5.329513 2.123209 0.219892
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND 5.254717 2.616981 0.221185

20433 rows × 13 columns

In [28]:
df.median_house_value.hist(bins=100,figsize=(12,8))
path_filename=os.path.join(folder_path,"hist_median_house_value.png")
plt.savefig(path_filename)
plt.show()
In [29]:
# show the corrolation between the columns
df.corr()
C:\Users\fadia\AppData\Local\Temp\ipykernel_38812\4113669569.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr()
Out[29]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value rooms_per_household pop_per_household bedrooms_per_room
longitude 1.000000 -0.924616 -0.109357 0.045480 0.069608 0.100270 0.056513 -0.015550 -0.045398 -0.027307 0.002304 0.092657
latitude -0.924616 1.000000 0.011899 -0.036667 -0.066983 -0.108997 -0.071774 -0.079626 -0.144638 0.106423 0.002522 -0.113815
housing_median_age -0.109357 0.011899 1.000000 -0.360628 -0.320451 -0.295787 -0.302768 -0.118278 0.106432 -0.153031 0.013258 0.136089
total_rooms 0.045480 -0.036667 -0.360628 1.000000 0.930380 0.857281 0.918992 0.197882 0.133294 0.133482 -0.024596 -0.187900
total_bedrooms 0.069608 -0.066983 -0.320451 0.930380 1.000000 0.877747 0.979728 -0.007723 0.049686 0.001538 -0.028355 0.084238
population 0.100270 -0.108997 -0.295787 0.857281 0.877747 1.000000 0.907186 0.005087 -0.025300 -0.071898 0.070062 0.035319
households 0.056513 -0.071774 -0.302768 0.918992 0.979728 0.907186 1.000000 0.013434 0.064894 -0.080165 -0.027336 0.065087
median_income -0.015550 -0.079626 -0.118278 0.197882 -0.007723 0.005087 0.013434 1.000000 0.688355 0.325307 0.018894 -0.615661
median_house_value -0.045398 -0.144638 0.106432 0.133294 0.049686 -0.025300 0.064894 0.688355 1.000000 0.151344 -0.023639 -0.255880
rooms_per_household -0.027307 0.106423 -0.153031 0.133482 0.001538 -0.071898 -0.080165 0.325307 0.151344 1.000000 -0.004873 -0.416952
pop_per_household 0.002304 0.002522 0.013258 -0.024596 -0.028355 0.070062 -0.027336 0.018894 -0.023639 -0.004873 1.000000 0.002938
bedrooms_per_room 0.092657 -0.113815 0.136089 -0.187900 0.084238 0.035319 0.065087 -0.615661 -0.255880 -0.416952 0.002938 1.000000
In [35]:
# just we need the correlation between the median house value and the all columns
#  sort the correlation from the postive corrlation to negative correlation
df.corr(numeric_only=True).median_house_value.sort_values(ascending=False)
Out[35]:
median_house_value     1.000000
median_income          0.688355
rooms_per_household    0.151344
total_rooms            0.133294
housing_median_age     0.106432
households             0.064894
total_bedrooms         0.049686
pop_per_household     -0.023639
population            -0.025300
longitude             -0.045398
latitude              -0.144638
bedrooms_per_room     -0.255880
Name: median_house_value, dtype: float64
In [37]:
# we can see there is high correlation between median_income and median_house_value
# this mean higher income it will be  higher house value
df.median_income.hist(bins=100,figsize=(12,6))
plt.show()
In [38]:
# regression plot betwwen median_income and median_house_value
# scatter plot with linear Regresssion and histgram for each one
sns.set(font_scale=1.5)
sns.jointplot(data=df,x="median_income",y="median_house_value",kind="reg",height=10)
plt.show()
In [40]:
# plot with kernel density estimator  مقدر كثافة النواة
sns.set(font_scale=1.5)
sns.jointplot(data=df,x="median_income",y="median_house_value",kind="kde",height=10)
plt.show()
In [45]:
df.plot(kind="scatter",x="longitude",y="latitude",
       s=df.population/100,label="Population",figsize=(15,10),
       c="median_house_value",cmap="coolwarm",
       colorbar=True,alpha=0.4,fontsize=15,sharex=False)
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("Longitude",fontsize=14)
plt.legend(fontsize=16)
plt.show()
In [47]:
import matplotlib.image as mpimg
california_img = mpimg.imread("california.png")
In [48]:
california_img
Out[48]:
array([[[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]],

       [[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]],

       [[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]],

       ...,

       [[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]],

       [[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]],

       [[0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        [0.627451 , 0.7764706, 0.9137255, 1.       ],
        ...,
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ],
        [0.9647059, 0.827451 , 0.6666667, 1.       ]]], dtype=float32)
In [49]:
plt.figure(figsize=(15,10))
plt.imshow(california_img)
plt.show()
In [50]:
plt.figure(figsize=(15,10))
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05])
plt.show()
In [60]:
df.plot(kind="scatter",x="longitude",y="latitude",
       s=df.population/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
path_filename=os.path.join(folder_path,"california_plot.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()
In [62]:
prox= df.ocean_proximity.unique()
prox
Out[62]:
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

Plot the data depend on ocean proximity¶

In [67]:
df_loc_near_bay= df[df.ocean_proximity == prox[0]].copy()
df_loc_less_one_hour= df[df.ocean_proximity == prox[1]].copy()
df_loc_inland= df[df.ocean_proximity == prox[2]].copy()
df_loc_near_ocean= df[df.ocean_proximity == prox[3]].copy()
df_loc_island= df[df.ocean_proximity == prox[4]].copy()

Naer Bay¶

In [79]:
df_loc_near_bay.plot(kind="scatter",x="longitude",y="latitude",
       s=df_loc_near_bay["population"]/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
plt.title("Meadian House Value by Near Bay")
path_filename=os.path.join(folder_path,"california_plot_near_bay.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()

<1Houre¶

In [80]:
df_loc_less_one_hour.plot(kind="scatter",x="longitude",y="latitude",
       s=df_loc_less_one_hour["population"]/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
plt.title("Meadian House Value by <1 Houre")
path_filename=os.path.join(folder_path,"california_plot_less_one_houre.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()

Inland¶

In [81]:
df_loc_inland.plot(kind="scatter",x="longitude",y="latitude",
       s=df_loc_inland["population"]/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
plt.title("Meadian House Value by Inland")
path_filename=os.path.join(folder_path,"california_plot_inland.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()

Near Ocean¶

In [82]:
df_loc_near_ocean.plot(kind="scatter",x="longitude",y="latitude",
       s=df_loc_near_ocean["population"]/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
plt.title("Meadian House Value by Near Ocean")
path_filename=os.path.join(folder_path,"california_plot_near_ocean.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()

Island¶

In [83]:
df_loc_island.plot(kind="scatter",x="longitude",y="latitude",
       s=df_loc_island["population"]/100,c="median_house_value",cmap="coolwarm",
        alpha=0.4,sharex=False,colorbar=True,figsize=(15,10),label="Population"
       )
plt.imshow(california_img,extent=[-124.55,-113.80,32.45,42.05],alpha=0.5,cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude",fontsize=14)
plt.xlabel("longitude",fontsize=14)
plt.legend(fontsize=15)
plt.title("Meadian House Value by Island")
path_filename=os.path.join(folder_path,"california_plot_island.png")
plt.savefig(path_filename, dpi=300,bbox_inches="tight")
plt.show()
In [84]:
df
Out[84]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity rooms_per_household pop_per_household bedrooms_per_room
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY 6.984127 2.555556 0.146591
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY 6.238137 2.109842 0.155797
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY 8.288136 2.802260 0.129516
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY 5.817352 2.547945 0.184458
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY 6.281853 2.181467 0.172096
... ... ... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 78100.0 INLAND 5.045455 2.560606 0.224625
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 77100.0 INLAND 6.114035 3.122807 0.215208
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 92300.0 INLAND 5.205543 2.325635 0.215173
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 84700.0 INLAND 5.329513 2.123209 0.219892
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 89400.0 INLAND 5.254717 2.616981 0.221185

20433 rows × 13 columns

In [86]:
df.median_income.hist(bins=50,figsize=(12,6))
plt.title("Median Income")
plt.show()
In [87]:
# transform a numeric column (df.median_income) into categorical values based on quantiles (percentiles).
pd.qcut(df.median_income,q=[0,0.25,0.50,0.75,1])
Out[87]:
0         (4.744, 15.0]
1         (4.744, 15.0]
2         (4.744, 15.0]
3         (4.744, 15.0]
4        (3.536, 4.744]
              ...      
20635    (0.499, 2.564]
20636    (0.499, 2.564]
20637    (0.499, 2.564]
20638    (0.499, 2.564]
20639    (0.499, 2.564]
Name: median_income, Length: 20433, dtype: category
Categories (4, interval[float64, right]): [(0.499, 2.564] < (2.564, 3.536] < (3.536, 4.744] < (4.744, 15.0]]
In [90]:
df["income_cat"]=pd.qcut(df.median_income,q=[0,0.25,0.50,0.75,0.95,1],
                        labels=["Low","Below_Average","Above_Average","High","Very_High"])
In [92]:
df.income_cat
Out[92]:
0            Very_High
1            Very_High
2                 High
3                 High
4        Above_Average
             ...      
20635              Low
20636              Low
20637              Low
20638              Low
20639              Low
Name: income_cat, Length: 20433, dtype: category
Categories (5, object): ['Low' < 'Below_Average' < 'Above_Average' < 'High' < 'Very_High']
In [93]:
df.income_cat.value_counts(normalize=True)
Out[93]:
Low              0.250037
Above_Average    0.250037
Below_Average    0.249988
High             0.199922
Very_High        0.050017
Name: income_cat, dtype: float64
In [106]:
# plot the categories
plt.figure(figsize=(12,6))
sns.set(font_scale=1.5)
sns.countplot(data=df,x="income_cat",hue="ocean_proximity")
plt.legend(loc=1)
plt.show()
In [112]:
plt.figure(figsize=(12,6))
sns.set(font_scale=1.5)
sns.barplot(data=df,x="income_cat",y="median_house_value",dodge=True)
plt.show()
In [113]:
plt.figure(figsize=(12,6))
sns.set(font_scale=1.5)
sns.barplot(data=df,x="ocean_proximity",y="median_house_value",dodge=True)
plt.show()
In [116]:
df.groupby(["income_cat","ocean_proximity"]).median_house_value.mean().unstack().drop(columns=["ISLAND"])
Out[116]:
ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
income_cat
Low 161337.076923 84820.626650 155122.052133 148027.826514
Below_Average 197236.013829 115124.088283 220196.177656 208665.190096
Above_Average 232278.358759 147846.891351 261965.251582 255293.813584
High 292208.766217 208095.566622 322566.033663 337446.227778
Very_High 439784.235489 347571.736842 451015.078788 468739.723270
In [117]:
matrix=df.groupby(["income_cat","ocean_proximity"]).median_house_value.mean().unstack().drop(columns=["ISLAND"])
In [118]:
matrix.astype("int")
Out[118]:
ocean_proximity <1H OCEAN INLAND NEAR BAY NEAR OCEAN
income_cat
Low 161337 84820 155122 148027
Below_Average 197236 115124 220196 208665
Above_Average 232278 147846 261965 255293
High 292208 208095 322566 337446
Very_High 439784 347571 451015 468739
In [119]:
plt.figure(figsize=(12,6))
sns.set(font_scale=1.5)
sns.heatmap(matrix.astype(int),cmap="Reds",annot=True,fmt="d",vmin=90000,vmax=470000)
plt.show()

Feature Engineering¶

In [120]:
label=df.median_house_value.copy()
label
Out[120]:
0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20433, dtype: float64
In [121]:
features=df.drop(columns=["median_house_value"])
features
Out[121]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity rooms_per_household pop_per_household bedrooms_per_room income_cat
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 NEAR BAY 6.984127 2.555556 0.146591 Very_High
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 NEAR BAY 6.238137 2.109842 0.155797 Very_High
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 NEAR BAY 8.288136 2.802260 0.129516 High
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 NEAR BAY 5.817352 2.547945 0.184458 High
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 NEAR BAY 6.281853 2.181467 0.172096 Above_Average
... ... ... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 INLAND 5.045455 2.560606 0.224625 Low
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 INLAND 6.114035 3.122807 0.215208 Low
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 INLAND 5.205543 2.325635 0.215173 Low
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 INLAND 5.329513 2.123209 0.219892 Low
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 INLAND 5.254717 2.616981 0.221185 Low

20433 rows × 13 columns

In [122]:
features.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   longitude            20433 non-null  float64 
 1   latitude             20433 non-null  float64 
 2   housing_median_age   20433 non-null  float64 
 3   total_rooms          20433 non-null  float64 
 4   total_bedrooms       20433 non-null  float64 
 5   population           20433 non-null  float64 
 6   households           20433 non-null  float64 
 7   median_income        20433 non-null  float64 
 8   ocean_proximity      20433 non-null  object  
 9   rooms_per_household  20433 non-null  float64 
 10  pop_per_household    20433 non-null  float64 
 11  bedrooms_per_room    20433 non-null  float64 
 12  income_cat           20433 non-null  category
dtypes: category(1), float64(11), object(1)
memory usage: 2.6+ MB
In [123]:
features.select_dtypes("float")
Out[123]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 6.984127 2.555556 0.146591
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 6.238137 2.109842 0.155797
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 8.288136 2.802260 0.129516
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 5.817352 2.547945 0.184458
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 6.281853 2.181467 0.172096
... ... ... ... ... ... ... ... ... ... ... ...
20635 -121.09 39.48 25.0 1665.0 374.0 845.0 330.0 1.5603 5.045455 2.560606 0.224625
20636 -121.21 39.49 18.0 697.0 150.0 356.0 114.0 2.5568 6.114035 3.122807 0.215208
20637 -121.22 39.43 17.0 2254.0 485.0 1007.0 433.0 1.7000 5.205543 2.325635 0.215173
20638 -121.32 39.43 18.0 1860.0 409.0 741.0 349.0 1.8672 5.329513 2.123209 0.219892
20639 -121.24 39.37 16.0 2785.0 616.0 1387.0 530.0 2.3886 5.254717 2.616981 0.221185

20433 rows × 11 columns

In [124]:
import scipy.stats as stats
In [125]:
feat1=features.select_dtypes("float").apply(lambda x:stats.zscore(x))
feat1
Out[125]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room
0 -1.327314 1.051717 0.982163 -0.803813 -0.970325 -0.973320 -0.976833 2.345163 0.625395 -0.049433 -1.146024
1 -1.322323 1.042355 -0.606210 2.042130 1.348276 0.861339 1.670373 2.332632 0.324942 -0.092134 -0.987254
2 -1.332305 1.037674 1.855769 -0.535189 -0.825561 -0.819769 -0.843427 1.782939 1.150594 -0.025797 -1.440514
3 -1.337296 1.037674 1.855769 -0.623510 -0.718768 -0.765056 -0.733562 0.932970 0.155467 -0.050162 -0.492925
4 -1.337296 1.037674 1.855769 -0.461970 -0.611974 -0.758879 -0.628930 -0.013143 0.342549 -0.085272 -0.706141
... ... ... ... ... ... ... ... ... ... ... ...
20635 -0.758318 1.800677 -0.288535 -0.444580 -0.388895 -0.511787 -0.443207 -1.216727 -0.155420 -0.048949 0.199820
20636 -0.818212 1.805358 -0.844466 -0.887557 -0.920488 -0.943315 -1.008223 -0.692044 0.274959 0.004912 0.037412
20637 -0.823203 1.777272 -0.923885 -0.175042 -0.125472 -0.368826 -0.173778 -1.143171 -0.090943 -0.071460 0.036808
20638 -0.873115 1.777272 -0.844466 -0.355344 -0.305834 -0.603564 -0.393506 -1.055136 -0.041013 -0.090853 0.118204
20639 -0.833186 1.749186 -1.003304 0.067955 0.185416 -0.033487 0.079956 -0.780606 -0.071138 -0.043548 0.140495

20433 rows × 11 columns

In [126]:
pd.options.display.float_format ="{:.2f}".format
In [127]:
feat1
Out[127]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room
0 -1.33 1.05 0.98 -0.80 -0.97 -0.97 -0.98 2.35 0.63 -0.05 -1.15
1 -1.32 1.04 -0.61 2.04 1.35 0.86 1.67 2.33 0.32 -0.09 -0.99
2 -1.33 1.04 1.86 -0.54 -0.83 -0.82 -0.84 1.78 1.15 -0.03 -1.44
3 -1.34 1.04 1.86 -0.62 -0.72 -0.77 -0.73 0.93 0.16 -0.05 -0.49
4 -1.34 1.04 1.86 -0.46 -0.61 -0.76 -0.63 -0.01 0.34 -0.09 -0.71
... ... ... ... ... ... ... ... ... ... ... ...
20635 -0.76 1.80 -0.29 -0.44 -0.39 -0.51 -0.44 -1.22 -0.16 -0.05 0.20
20636 -0.82 1.81 -0.84 -0.89 -0.92 -0.94 -1.01 -0.69 0.27 0.00 0.04
20637 -0.82 1.78 -0.92 -0.18 -0.13 -0.37 -0.17 -1.14 -0.09 -0.07 0.04
20638 -0.87 1.78 -0.84 -0.36 -0.31 -0.60 -0.39 -1.06 -0.04 -0.09 0.12
20639 -0.83 1.75 -1.00 0.07 0.19 -0.03 0.08 -0.78 -0.07 -0.04 0.14

20433 rows × 11 columns

In [128]:
feat1.agg(["mean","std"])
Out[128]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room
mean -0.00 0.00 0.00 -0.00 -0.00 -0.00 -0.00 0.00 -0.00 -0.00 0.00
std 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
In [130]:
# handeling the categorecal column
features.ocean_proximity
Out[130]:
0        NEAR BAY
1        NEAR BAY
2        NEAR BAY
3        NEAR BAY
4        NEAR BAY
           ...   
20635      INLAND
20636      INLAND
20637      INLAND
20638      INLAND
20639      INLAND
Name: ocean_proximity, Length: 20433, dtype: object
In [131]:
features.ocean_proximity.value_counts()
Out[131]:
<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64
In [132]:
dummies= pd.get_dummies(features.ocean_proximity)
In [133]:
dummies
Out[133]:
<1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 0 0 0 1 0
1 0 0 0 1 0
2 0 0 0 1 0
3 0 0 0 1 0
4 0 0 0 1 0
... ... ... ... ... ...
20635 0 1 0 0 0
20636 0 1 0 0 0
20637 0 1 0 0 0
20638 0 1 0 0 0
20639 0 1 0 0 0

20433 rows × 5 columns

In [134]:
features = pd.concat([feat1,dummies,df.income_cat],axis=1)
In [135]:
features
Out[135]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN income_cat
0 -1.33 1.05 0.98 -0.80 -0.97 -0.97 -0.98 2.35 0.63 -0.05 -1.15 0 0 0 1 0 Very_High
1 -1.32 1.04 -0.61 2.04 1.35 0.86 1.67 2.33 0.32 -0.09 -0.99 0 0 0 1 0 Very_High
2 -1.33 1.04 1.86 -0.54 -0.83 -0.82 -0.84 1.78 1.15 -0.03 -1.44 0 0 0 1 0 High
3 -1.34 1.04 1.86 -0.62 -0.72 -0.77 -0.73 0.93 0.16 -0.05 -0.49 0 0 0 1 0 High
4 -1.34 1.04 1.86 -0.46 -0.61 -0.76 -0.63 -0.01 0.34 -0.09 -0.71 0 0 0 1 0 Above_Average
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20635 -0.76 1.80 -0.29 -0.44 -0.39 -0.51 -0.44 -1.22 -0.16 -0.05 0.20 0 1 0 0 0 Low
20636 -0.82 1.81 -0.84 -0.89 -0.92 -0.94 -1.01 -0.69 0.27 0.00 0.04 0 1 0 0 0 Low
20637 -0.82 1.78 -0.92 -0.18 -0.13 -0.37 -0.17 -1.14 -0.09 -0.07 0.04 0 1 0 0 0 Low
20638 -0.87 1.78 -0.84 -0.36 -0.31 -0.60 -0.39 -1.06 -0.04 -0.09 0.12 0 1 0 0 0 Low
20639 -0.83 1.75 -1.00 0.07 0.19 -0.03 0.08 -0.78 -0.07 -0.04 0.14 0 1 0 0 0 Low

20433 rows × 17 columns

Splitting the Data into Train and Test Set¶

In [136]:
test_size=0.2
In [137]:
x_test=features.sample(frac=test_size,random_state=123)
In [138]:
x_test
Out[138]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income rooms_per_household pop_per_household bedrooms_per_room <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN income_cat
14354 1.17 -1.35 -0.45 -0.10 0.27 -0.13 0.25 -0.39 -0.56 -0.09 0.99 0 0 0 0 1 Below_Average
12908 -0.86 1.40 -0.37 0.11 -0.03 -0.12 -0.03 -0.29 0.18 -0.04 -0.51 0 1 0 0 0 Below_Average
19545 -0.70 0.93 0.82 -0.13 0.16 0.55 0.28 -1.14 -0.62 0.03 0.76 0 1 0 0 0 Low
12188 1.11 -0.91 -1.40 -0.64 -0.72 -0.74 -0.81 -0.40 0.42 -0.00 -0.36 1 0 0 0 0 Below_Average
14786 1.22 -1.43 -0.61 -0.41 -0.58 -0.39 -0.53 0.50 0.16 0.02 -0.75 0 0 0 0 1 High
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10262 0.85 -0.82 -0.69 0.61 0.01 0.41 0.11 1.43 0.77 0.04 -1.33 1 0 0 0 0 High
3614 0.58 -0.65 0.74 -0.98 -1.04 -0.90 -1.04 -0.39 -0.21 0.09 -0.12 1 0 0 0 0 Below_Average
19296 -1.62 1.30 0.90 -0.11 -0.27 -0.33 -0.19 0.01 0.09 -0.06 -0.64 1 0 0 0 0 Above_Average
5826 0.63 -0.68 1.86 -0.43 -0.62 -0.60 -0.62 0.49 0.43 -0.02 -0.87 1 0 0 0 0 High
15383 1.18 -1.05 -1.16 1.26 0.67 0.82 0.81 0.62 0.49 -0.02 -1.05 1 0 0 0 0 High

4087 rows × 17 columns

In [139]:
x_test.income_cat.value_counts(normalize=True)
Out[139]:
Above_Average   0.25
Below_Average   0.25
Low             0.25
High            0.20
Very_High       0.05
Name: income_cat, dtype: float64
In [140]:
features.income_cat.value_counts(normalize=True)
Out[140]:
Low             0.25
Above_Average   0.25
Below_Average   0.25
High            0.20
Very_High       0.05
Name: income_cat, dtype: float64
In [141]:
x_test.index
Out[141]:
Int64Index([14354, 12908, 19545, 12188, 14786,  9941,  3179,  4650, 15550,
            17190,
            ...
             3992, 10261, 10862, 10863, 13864, 10262,  3614, 19296,  5826,
            15383],
           dtype='int64', length=4087)
In [142]:
x_train =features.loc[~features.index.isin(x_test.index)].copy()
In [144]:
x_train.income_cat.value_counts(normalize=True)
Out[144]:
Low             0.25
Below_Average   0.25
Above_Average   0.25
High            0.20
Very_High       0.05
Name: income_cat, dtype: float64
In [145]:
x_train= x_train.sample(frac=1,random_state=123)
In [146]:
x_train.drop(columns=["income_cat"],inplace=True)
x_test.drop(columns=["income_cat"],inplace=True)
In [147]:
y_train = label.loc[x_train.index]
y_test =label.loc[x_test.index]

Training the ML Model (Random Forest Regressor)¶

In [148]:
from sklearn.ensemble import RandomForestRegressor
In [149]:
forest_reg=RandomForestRegressor(random_state=42,n_estimators=500,
                                max_features="sqrt",max_depth=75,min_samples_split=2)
In [150]:
forest_reg.fit(x_train,y_train)
Out[150]:
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
                      random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
                      random_state=42)
In [151]:
forest_reg.score(x_train,y_train)
Out[151]:
0.9758470860678036
In [152]:
from sklearn.metrics import mean_squared_error
In [153]:
pred = forest_reg.predict(x_train)
pred
Out[153]:
array([238374.608, 246813.8  ,  74191.4  , ..., 184930.402, 117595.4  ,
       187186.008])
In [154]:
forest_mse = mean_squared_error(y_train,pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
Out[154]:
18023.671210966968

Evaluating the Model on the Test Set¶

In [155]:
forest_reg
Out[155]:
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
                      random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(max_depth=75, max_features='sqrt', n_estimators=500,
                      random_state=42)
In [156]:
forest_reg.score(x_test,y_test)
Out[156]:
0.825152593253362
In [157]:
pred =forest_reg.predict(x_test)
pred
Out[157]:
array([224965.808, 129246.8  ,  67903.   , ..., 222406.402, 322985.02 ,
       268692.8  ])
In [158]:
forest_mse = mean_squared_error(y_test,pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
Out[158]:
47348.34022326726
In [159]:
comp = pd.DataFrame(data={"True_v":y_test,"Pred":pred})
In [160]:
comp
Out[160]:
True_v Pred
14354 101800.00 224965.81
12908 213000.00 129246.80
19545 58800.00 67903.00
12188 111300.00 187250.02
14786 174100.00 160228.40
... ... ...
10262 272200.00 296562.20
3614 175900.00 183863.20
19296 215900.00 222406.40
5826 326100.00 322985.02
15383 291500.00 268692.80

4087 rows × 2 columns

In [162]:
ae=comp.True_v.sub(comp.Pred).abs()
ae
Out[162]:
14354   123165.81
12908    83753.20
19545     9103.00
12188    75950.02
14786    13871.60
           ...   
10262    24362.20
3614      7963.20
19296     6506.40
5826      3114.98
15383    22807.20
Length: 4087, dtype: float64
In [163]:
mae = ae.mean()
In [164]:
mae
Out[164]:
31722.632696354292

Feature Importance¶

In [165]:
forest_reg.feature_importances_
Out[165]:
array([8.43251997e-02, 7.63824246e-02, 4.20571593e-02, 2.28712182e-02,
       2.01285036e-02, 2.28577613e-02, 1.96337714e-02, 2.80813190e-01,
       6.50088158e-02, 9.89248218e-02, 9.71199460e-02, 1.91328297e-02,
       1.36700488e-01, 2.65441330e-04, 5.35959721e-03, 8.41883258e-03])
In [166]:
feature_imp = pd.Series(data=forest_reg.feature_importances_,
                       index=x_train.columns).sort_values(ascending=False)
In [167]:
feature_imp
Out[167]:
median_income         0.28
INLAND                0.14
pop_per_household     0.10
bedrooms_per_room     0.10
longitude             0.08
latitude              0.08
rooms_per_household   0.07
housing_median_age    0.04
total_rooms           0.02
population            0.02
total_bedrooms        0.02
households            0.02
<1H OCEAN             0.02
NEAR OCEAN            0.01
NEAR BAY              0.01
ISLAND                0.00
dtype: float64
In [169]:
feature_imp.sort_values().plot.barh(figsize=(12,6))
plt.show()
In [ ]: